{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "671a2587-c004-4723-9860-80e1f2cb60f7",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_28943/1875617307.py:31: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
      "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
      "  train_feat = [extract_feature_from_pdf(x) for x in tqdm_notebook(train_paths)]\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "beb2ffbed34049a189c74fa5d914972b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/815 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import PyPDF2\n",
    "import glob \n",
    "from tqdm import tqdm_notebook\n",
    "import numpy as np\n",
    "\n",
    "def extract_feature_from_pdf(path):\n",
    "    reader = PyPDF2.PdfReader(path)\n",
    "    text = '\\n'.join([reader.pages[idx].extract_text() for idx in range(len(reader.pages))])\n",
    "    feat = [\n",
    "        len(reader.pages),\n",
    "        len(text),\n",
    "        np.mean([len(x) for x in text.split('\\n')]),\n",
    "        np.max([len(x) for x in text.split('\\n')]),\n",
    "        np.std([len(x) for x in text.split('\\n')]),\n",
    "\n",
    "        len(set(text)),\n",
    "        len(text) - len(set(text)),\n",
    "        len(set(text)) / (len(text) + 1),\n",
    "        \n",
    "        len(text.split()),\n",
    "        len(text.split('\\n')),\n",
    "        text.count('x'),\n",
    "        text.count('xxx'),\n",
    "        sum([text.count(x) for x in '0123456789']),\n",
    "        text.count('@'),\n",
    "    ]\n",
    "    return feat\n",
    "\n",
    "train_paths = glob.glob('./校招简历信息完整性检测训练集/*/*.pdf')\n",
    "train_label = ['正样本' in x for x in train_paths]\n",
    "train_feat = [extract_feature_from_pdf(x) for x in tqdm_notebook(train_paths)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "0e98d1cd-2239-4888-afa0-4dca88bf4e8f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from lightgbm import LGBMClassifier\n",
    "\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "87ea087d-e7b3-4dae-94b6-e036c8a81e95",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Info] Number of positive: 559, number of negative: 93\n",
      "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000485 seconds.\n",
      "You can set `force_col_wise=true` to remove the overhead.\n",
      "[LightGBM] [Info] Total Bins 1658\n",
      "[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 14\n",
      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.857362 -> initscore=1.793550\n",
      "[LightGBM] [Info] Start training from score 1.793550\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Info] Number of positive: 559, number of negative: 93\n",
      "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.\n",
      "You can set `force_col_wise=true` to remove the overhead.\n",
      "[LightGBM] [Info] Total Bins 1653\n",
      "[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 14\n",
      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.857362 -> initscore=1.793550\n",
      "[LightGBM] [Info] Start training from score 1.793550\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Info] Number of positive: 558, number of negative: 94\n",
      "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.\n",
      "You can set `force_col_wise=true` to remove the overhead.\n",
      "[LightGBM] [Info] Total Bins 1664\n",
      "[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 14\n",
      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.855828 -> initscore=1.781064\n",
      "[LightGBM] [Info] Start training from score 1.781064\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Info] Number of positive: 558, number of negative: 94\n",
      "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.\n",
      "You can set `force_col_wise=true` to remove the overhead.\n",
      "[LightGBM] [Info] Total Bins 1655\n",
      "[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 14\n",
      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.855828 -> initscore=1.781064\n",
      "[LightGBM] [Info] Start training from score 1.781064\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Info] Number of positive: 558, number of negative: 94\n",
      "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.\n",
      "You can set `force_col_wise=true` to remove the overhead.\n",
      "[LightGBM] [Info] Total Bins 1664\n",
      "[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 14\n",
      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.855828 -> initscore=1.781064\n",
      "[LightGBM] [Info] Start training from score 1.781064\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "       False     0.6000    0.4359    0.5050       117\n",
      "        True     0.9096    0.9513    0.9300       698\n",
      "\n",
      "    accuracy                         0.8773       815\n",
      "   macro avg     0.7548    0.6936    0.7175       815\n",
      "weighted avg     0.8651    0.8773    0.8690       815\n",
      "\n"
     ]
    }
   ],
   "source": [
    "val_pred = cross_val_predict(\n",
    "    LGBMClassifier(),\n",
    "    np.array(train_feat),\n",
    "    train_label\n",
    ")\n",
    "\n",
    "print(\n",
    "    classification_report(train_label, val_pred, digits=4)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "9cd1e2be-75c2-4a4b-b8e1-84f08002a3d6",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Info] Number of positive: 698, number of negative: 117\n",
      "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000293 seconds.\n",
      "You can set `force_col_wise=true` to remove the overhead.\n",
      "[LightGBM] [Info] Total Bins 1912\n",
      "[LightGBM] [Info] Number of data points in the train set: 815, number of used features: 14\n",
      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.856442 -> initscore=1.786045\n",
      "[LightGBM] [Info] Start training from score 1.786045\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
      "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['lgb.pkl']"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m = LGBMClassifier().fit(\n",
    "    np.array(train_feat),\n",
    "    train_label\n",
    ")\n",
    "import joblib\n",
    "joblib.dump(m, 'lgb.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "18408c37-62a8-4cef-aa73-e766d6a0109b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-4 {color: black;background-color: white;}#sk-container-id-4 pre{padding: 0;}#sk-container-id-4 div.sk-toggleable {background-color: white;}#sk-container-id-4 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-4 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-4 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-4 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-4 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-4 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-4 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-4 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-4 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-4 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-4 div.sk-item {position: relative;z-index: 1;}#sk-container-id-4 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-4 div.sk-item::before, #sk-container-id-4 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-4 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-4 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-4 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-4 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-4 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-4 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-4 div.sk-label-container {text-align: center;}#sk-container-id-4 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-4 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LGBMClassifier()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" checked><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LGBMClassifier</label><div class=\"sk-toggleable__content\"><pre>LGBMClassifier()</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LGBMClassifier()"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.load('lgb.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbb0bac3-05b6-46db-86fd-c5f66d56165e",
   "metadata": {},
   "source": [
    "如下为run.py内容\n",
    "\n",
    "```python\n",
    "import PyPDF2\n",
    "import glob \n",
    "import numpy as np\n",
    "import joblib\n",
    "import pandas as pd\n",
    "from lightgbm import LGBMClassifier\n",
    "\n",
    "def extract_feature_from_pdf(path):\n",
    "    reader = PyPDF2.PdfReader(path)\n",
    "    text = '\\n'.join([reader.pages[idx].extract_text() for idx in range(len(reader.pages))])\n",
    "    feat = [\n",
    "        len(reader.pages),\n",
    "        len(text),\n",
    "        np.mean([len(x) for x in text.split('\\n')]),\n",
    "        np.max([len(x) for x in text.split('\\n')]),\n",
    "        np.std([len(x) for x in text.split('\\n')]),\n",
    "\n",
    "        len(set(text)),\n",
    "        len(text) - len(set(text)),\n",
    "        len(set(text)) / (len(text) + 1),\n",
    "        \n",
    "        len(text.split()),\n",
    "        len(text.split('\\n')),\n",
    "        text.count('x'),\n",
    "        text.count('xxx'),\n",
    "        sum([text.count(x) for x in '0123456789']),\n",
    "        text.count('@'),\n",
    "    ]\n",
    "    return feat\n",
    "\n",
    "test_paths = glob.glob('/work/data/integrity-check-of-resume-test-set/*.pdf')[:]\n",
    "test_feat = [extract_feature_from_pdf(x) for x in test_paths]\n",
    "\n",
    "m = joblib.load('lgb.pkl')\n",
    "\n",
    "pd.DataFrame({\n",
    "    'ResumeID': [x.split('/')[-1] for x in test_paths],\n",
    "    'label': m.predict(test_feat).astype(int)\n",
    "}).to_csv('/work/output/result.csv', index=None)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "fd02c481-c0dd-44a1-b6ae-75c59aa5bc78",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "information-integrity/\n",
      "information-integrity/lgb.pkl\n",
      "information-integrity/.ipynb_checkpoints/\n",
      "information-integrity/.ipynb_checkpoints/run-checkpoint.py\n",
      "information-integrity/run.py\n"
     ]
    }
   ],
   "source": [
    "!tar -cvzf information-integrity.tar.gz information-integrity/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "6a5a549f-0bc3-4a65-ade6-132dc3057988",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "upload: 'information-integrity.tar.gz' -> 's3://ai-competition/0fs76epw/information-integrity.tar.gz'  [1 of 1]\n",
      " 134511 of 134511   100% in    0s  1074.60 kB/s  done\n"
     ]
    }
   ],
   "source": [
    "!s3cmd put information-integrity.tar.gz s3://ai-competition/0fs76epw/information-integrity.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dabf9204-b512-45ed-969a-bf479cddab11",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {
     "13c94a8965d246bbb97911a651a513bd": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "17b221f63a0842298f4e7344e321bf42": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "1821cd0a580c4c69bb30daa1629ae79a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_98db557092ff49a3bb11f609fce8ef28",
       "style": "IPY_MODEL_dd3277efac1f4cf693dc98f270acba26",
       "value": " 815/815 [00:54&lt;00:00, 17.93it/s]"
      }
     },
     "20a1983fa0344531ac33cc4163f1b922": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "layout": "IPY_MODEL_94a4bec88ff143d3a201bdf3e3de387d",
       "max": 815,
       "style": "IPY_MODEL_ded1588d11e24526958ca93dfa916a72",
       "value": 84
      }
     },
     "21b9e8a973b94f6fa6d7edb331cb888b": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "26e6465324464c9cad70df890d106599": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "danger",
       "layout": "IPY_MODEL_46e4a12f3f7141388267f5c8f13b0856",
       "max": 815,
       "style": "IPY_MODEL_cb4e0858649f4095a31f4dfbd81ff521",
       "value": 71
      }
     },
     "29797d76831f43ad92f83435b5ba2c04": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "2c802c2757904935972d9092928b770c": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "30f24b4e61204befbc33f78cf46bbb41": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "41c66351c35e48759410dffad2b10ee2": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "424a9dbd067a4a6a9e2c81830dbb2702": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_cb2e34a863294e1a804b5172d855b7f3",
       "style": "IPY_MODEL_17b221f63a0842298f4e7344e321bf42",
       "value": " 71/815 [05:40&lt;00:42, 17.41it/s]"
      }
     },
     "4303ae405e03459ba5c50445f6a1051d": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_975297ceba334355820db81cd9f6ac5b",
        "IPY_MODEL_87fd77ddc24a4d69829270019fb1d2f5",
        "IPY_MODEL_826d4e58f7574ee1bcfd5841a64e7a81"
       ],
       "layout": "IPY_MODEL_de3b7a5c3b9040a68969cbc54896e07f"
      }
     },
     "46e4a12f3f7141388267f5c8f13b0856": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "4871716919bc4c4d8f61b5c92b752c08": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_6b6c790f0e8040b8ab381f94d80d935f",
        "IPY_MODEL_20a1983fa0344531ac33cc4163f1b922",
        "IPY_MODEL_e34ddbd8cd004bd497c643e4bd06752a"
       ],
       "layout": "IPY_MODEL_e2d0dd7f2ff546e9acfb223c23e69ebc"
      }
     },
     "54fc7f16325646f5b302a9617858a724": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_866c86c8ef4f4a9d95dcc98106bc64d8",
       "style": "IPY_MODEL_8cf03b192a2645fbbf978ab9044babdb",
       "value": "100%"
      }
     },
     "598661c29c8a419ea24ae0a94b4cde14": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_f723c3091cf74344bd9876849f5e389f",
       "style": "IPY_MODEL_ca87e48b9ce64689a377656a5d4b6e03",
       "value": "  9%"
      }
     },
     "699bc8732925457ba7cf12c73ef601d5": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_30f24b4e61204befbc33f78cf46bbb41",
       "style": "IPY_MODEL_8a8eecefccac4c6cad4ba1590b9a1932",
       "value": "100%"
      }
     },
     "6b6c790f0e8040b8ab381f94d80d935f": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_a0187db643554749a74be9d95404a2ad",
       "style": "IPY_MODEL_ae940e494fe641a3b2104f02eafbaf93",
       "value": " 10%"
      }
     },
     "7119f9f40e3147dc9d70f0954718b6c9": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_598661c29c8a419ea24ae0a94b4cde14",
        "IPY_MODEL_26e6465324464c9cad70df890d106599",
        "IPY_MODEL_424a9dbd067a4a6a9e2c81830dbb2702"
       ],
       "layout": "IPY_MODEL_bd0b25217d4d4647b43318aec7412929"
      }
     },
     "713fd515671244eea40bb8523a583689": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_8c0435d940f149c0b131c0c49c2e014b",
       "max": 815,
       "style": "IPY_MODEL_21b9e8a973b94f6fa6d7edb331cb888b",
       "value": 815
      }
     },
     "7d7f61a96aa04e6dbd6c70e816f17687": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "826d4e58f7574ee1bcfd5841a64e7a81": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_dc30e1111f814be2b94208a9ed0be25d",
       "style": "IPY_MODEL_a554c465f4e94cd7a328118e2e9244e0",
       "value": " 631/815 [01:10&lt;00:14, 12.55it/s]"
      }
     },
     "866c86c8ef4f4a9d95dcc98106bc64d8": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "87fd77ddc24a4d69829270019fb1d2f5": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "danger",
       "layout": "IPY_MODEL_8d91bc9097194d539221b6c8efb32424",
       "max": 815,
       "style": "IPY_MODEL_41c66351c35e48759410dffad2b10ee2",
       "value": 631
      }
     },
     "8a5191b8c58c43478e49c1ca4893ff25": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_54fc7f16325646f5b302a9617858a724",
        "IPY_MODEL_c812456379eb4f4da6f0d25234ed442a",
        "IPY_MODEL_1821cd0a580c4c69bb30daa1629ae79a"
       ],
       "layout": "IPY_MODEL_2c802c2757904935972d9092928b770c"
      }
     },
     "8a8eecefccac4c6cad4ba1590b9a1932": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "8c0435d940f149c0b131c0c49c2e014b": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "8cf03b192a2645fbbf978ab9044babdb": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "8d91bc9097194d539221b6c8efb32424": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "94a4bec88ff143d3a201bdf3e3de387d": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "975297ceba334355820db81cd9f6ac5b": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_d73e08d4064b46a698d9b04d939c5580",
       "style": "IPY_MODEL_13c94a8965d246bbb97911a651a513bd",
       "value": " 77%"
      }
     },
     "98db557092ff49a3bb11f609fce8ef28": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "a0187db643554749a74be9d95404a2ad": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "a51b34bc250f49e3801e128f4794777c": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_e06af076bcee4199bd8af06386016f40",
       "style": "IPY_MODEL_e9312088a6a042f98c57ad5ff26987a7",
       "value": " 815/815 [00:57&lt;00:00, 16.02it/s]"
      }
     },
     "a554c465f4e94cd7a328118e2e9244e0": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "a80ccb4469394d7cb3b8ad40b938d37e": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "ae940e494fe641a3b2104f02eafbaf93": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "bd0b25217d4d4647b43318aec7412929": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "beb2ffbed34049a189c74fa5d914972b": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HBoxModel",
      "state": {
       "children": [
        "IPY_MODEL_699bc8732925457ba7cf12c73ef601d5",
        "IPY_MODEL_713fd515671244eea40bb8523a583689",
        "IPY_MODEL_a51b34bc250f49e3801e128f4794777c"
       ],
       "layout": "IPY_MODEL_29797d76831f43ad92f83435b5ba2c04"
      }
     },
     "c812456379eb4f4da6f0d25234ed442a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "FloatProgressModel",
      "state": {
       "bar_style": "success",
       "layout": "IPY_MODEL_d64b6cfbe236441da432c1c6936c0112",
       "max": 815,
       "style": "IPY_MODEL_f51429affe114d1f9a90a0b3781b6932",
       "value": 815
      }
     },
     "ca87e48b9ce64689a377656a5d4b6e03": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "cb2e34a863294e1a804b5172d855b7f3": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "cb4e0858649f4095a31f4dfbd81ff521": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "d64b6cfbe236441da432c1c6936c0112": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "d73e08d4064b46a698d9b04d939c5580": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "dc30e1111f814be2b94208a9ed0be25d": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "dd3277efac1f4cf693dc98f270acba26": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "de3b7a5c3b9040a68969cbc54896e07f": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "ded1588d11e24526958ca93dfa916a72": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "e06af076bcee4199bd8af06386016f40": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "e2d0dd7f2ff546e9acfb223c23e69ebc": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     },
     "e34ddbd8cd004bd497c643e4bd06752a": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "HTMLModel",
      "state": {
       "layout": "IPY_MODEL_a80ccb4469394d7cb3b8ad40b938d37e",
       "style": "IPY_MODEL_7d7f61a96aa04e6dbd6c70e816f17687",
       "value": " 84/815 [00:20&lt;00:38, 18.99it/s]"
      }
     },
     "e9312088a6a042f98c57ad5ff26987a7": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "DescriptionStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "f51429affe114d1f9a90a0b3781b6932": {
      "model_module": "@jupyter-widgets/controls",
      "model_module_version": "1.5.0",
      "model_name": "ProgressStyleModel",
      "state": {
       "description_width": ""
      }
     },
     "f723c3091cf74344bd9876849f5e389f": {
      "model_module": "@jupyter-widgets/base",
      "model_module_version": "1.2.0",
      "model_name": "LayoutModel",
      "state": {}
     }
    },
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
